Load libraries
Load files
Transform the data into a wide dataframe with one kappa-loss column per noise/percentage combination for each technique
# Transform the data
# Transform the data: one row per technique, one column per
# "noise_percentage" combination (e.g. "10_20" = 10 noise, 20% instances)
# pivot_wider() supersedes spread() in tidyr >= 1.0.0; names_sort = TRUE and
# the explicit arrange() reproduce spread()'s alphabetical column/row order.
wide_data <- meanKLC %>%
  unite("noise_percentage", noise, percentage, sep = "_") %>%
  pivot_wider(names_from = noise_percentage,
              values_from = kappa_loss,
              names_sort = TRUE) %>%
  arrange(technique)
# View the transformed data
print(wide_data)
## # A tibble: 20 × 133
## technique `0_0` `0_10` `0_100` `0_20` `0_30` `0_40` `0_50` `0_60` `0_70`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bayesglm 0 0 0 0 0 0 0 0 0
## 2 C5.0 0 0 0 0 0 0 0 0 0
## 3 ctree 0 0 0 0 0 0 0 0 0
## 4 fda 0 0 0 0 0 0 0 0 0
## 5 gbm 0 0 0 0 0 0 0 0 0
## 6 gcvEarth 0 0 0 0 0 0 0 0 0
## 7 JRip 0 0 0 0 0 0 0 0 0
## 8 knn 0 0 0 0 0 0 0 0 0
## 9 lvq 0 0 0 0 0 0 0 0 0
## 10 mlpML 0 0 0 0 0 0 0 0 0
## 11 multinom 0 0 0 0 0 0 0 0 0
## 12 naive_bayes 0 0 0 0 0 0 0 0 0
## 13 PART 0 0 0 0 0 0 0 0 0
## 14 rbfDDA 0 0 0 0 0 0 0 0 0
## 15 rda 0 0 0 0 0 0 0 0 0
## 16 rf 0 0 0 0 0 0 0 0 0
## 17 rfRules 0 0 0 0 0 0 0 0 0
## 18 rpart 0 0 0 0 0 0 0 0 0
## 19 simpls 0 0 0 0 0 0 0 0 0
## 20 svmRadial 0 0 0 0 0 0 0 0 0
## # ℹ 123 more variables: `0_80` <dbl>, `0_90` <dbl>, `10_0` <dbl>,
## # `10_10` <dbl>, `10_100` <dbl>, `10_20` <dbl>, `10_30` <dbl>, `10_40` <dbl>,
## # `10_50` <dbl>, `10_60` <dbl>, `10_70` <dbl>, `10_80` <dbl>, `10_90` <dbl>,
## # `100_0` <dbl>, `100_10` <dbl>, `100_100` <dbl>, `100_20` <dbl>,
## # `100_30` <dbl>, `100_40` <dbl>, `100_50` <dbl>, `100_60` <dbl>,
## # `100_70` <dbl>, `100_80` <dbl>, `100_90` <dbl>, `20_0` <dbl>,
## # `20_10` <dbl>, `20_100` <dbl>, `20_20` <dbl>, `20_30` <dbl>, …
The PCA and NbClust calculations were omitted here because of errors in
Model_Heirarchy.Rmd
Perform Hierarchical Clustering
Optimal K determined as 2.
# Perform hierarchical clustering using Ward's criterion on the
# precomputed distance matrix (distance_matrix is built elsewhere).
# NOTE(review): "ward.D" expects squared dissimilarities as input;
# confirm "ward.D2" was not intended here.
hclusters <- hclust(distance_matrix, method = "ward.D")
# Cut the tree to get k = 2 clusters (optimal k determined above)
k <- 2
clusters <- cutree(hclusters, k = k)
# Print cluster assignments (one integer label per technique, in the
# row order of the data the distance matrix was computed from)
print(clusters)
## [1] 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 1
# Techniques in the same row order as the data used for clustering,
# so positions line up with the cluster labels from cutree()
techniques <- wide_data$technique
# Map each technique to its assigned cluster
technique_clusters <- data.frame(technique = techniques, cluster = clusters)
# Fixed cluster -> color mapping (green, orange), reused by every plot
# so cluster colors stay consistent across all figures
cluster_colors <- setNames(c("#4FB28F", "#F65215"), c("1", "2"))
# Save the dendrogram with colored rectangles by cluster.
# (File name kept as-is; note "dendogram" is a typo for "dendrogram".)
png("../../results/plots/dendogram2.png", width = 4000, height = 3000, res = 600)
# sub = "" (not NULL) actually suppresses plot.hclust's default subtitle;
# sub = NULL selects the default, which echoes the clustering call
# (e.g. "distance_matrix, ward.D") under the x-axis label.
plot(hclusters, hang = -1, labels = wide_data$technique,
     main = paste("Hierarchical Grouping (k =", k, ")"),
     xlab = "Observations", sub = "")
# Colored rectangles, one color per cluster, consistent with other plots
rect.hclust(hclusters, k = k, border = cluster_colors[as.character(seq_len(k))])
invisible(dev.off())
# Generate a silhouette plot, reusing the dendrogram's cluster colors
png("../../results/plots/silhouette_t2.png", width = 4000, height = 3000, res = 600)
sil <- silhouette(clusters, dist = distance_matrix)
# Look up one color per cluster actually present, in ascending label order
present_clusters <- sort(unique(clusters))
plot(sil,
     col = cluster_colors[as.character(present_clusters)],
     main = paste("Silhouette Plot (k =", k, ")"))
invisible(dev.off())
# Attach each row's cluster assignment to the original long-format data
meanKLC_with_clusters <- meanKLC %>%
  left_join(technique_clusters, by = "technique")
# Mean kappa loss per cluster / noise level / percentage, rounded to 2 dp;
# .groups = "drop" returns an ungrouped tibble
cluster_means <- meanKLC_with_clusters %>%
  group_by(cluster, noise, percentage) %>%
  summarise(kappa_loss = round(mean(kappa_loss, na.rm = TRUE), 2),
            .groups = "drop")
print(cluster_means)
## # A tibble: 264 × 4
## cluster noise percentage kappa_loss
## <int> <dbl> <dbl> <dbl>
## 1 1 0 0 0
## 2 1 0 10 0
## 3 1 0 20 0
## 4 1 0 30 0
## 5 1 0 40 0
## 6 1 0 50 0
## 7 1 0 60 0
## 8 1 0 70 0
## 9 1 0 80 0
## 10 1 0 90 0
## # ℹ 254 more rows
# Create plots for individual techniques (optional)
for (instance in instances_names) {
  # Keep only rows at the current altered-instances percentage
  instance_data <- filter(meanKLC_with_clusters, percentage == instance)
  # One curve per technique, colored by its cluster assignment.
  # NOTE(review): scale limits of c(0, 0.5) drop (not clip) any values
  # above 0.5 — confirm that is intended.
  technique_plot <- ggplot(instance_data,
                           aes(x = noise, y = kappa_loss,
                               color = factor(cluster))) +
    geom_point() +
    geom_line(aes(group = technique)) +
    scale_color_manual(values = cluster_colors) +
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1)) +
    labs(x = "Noise", y = "Kappa Loss", color = "Cluster",
         title = paste0("Kappa Loss Curves by technique, noise and ",
                        instance, " % of instances altered")) +
    theme_bw()
  # Print plot
  print(technique_plot)
}
# Create plots for cluster means
for (instance in instances_names) {
  # Keep only cluster means at the current altered-instances percentage
  mean_data <- filter(cluster_means, percentage == instance)
  # One averaged curve per cluster, using the shared cluster colors
  cluster_plot <- ggplot(mean_data,
                         aes(x = noise, y = kappa_loss,
                             color = factor(cluster))) +
    geom_point() +
    geom_line(aes(group = cluster)) +
    scale_color_manual(values = cluster_colors) +
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1)) +
    labs(x = "Noise", y = "Kappa Loss", color = "Cluster",
         title = paste0("Kappa Loss Curves by cluster, noise and ",
                        instance, " % of instances altered")) +
    theme_bw()
  # Print plot
  print(cluster_plot)
}
# Build one combined plot per altered-instances percentage.
# lapply() replaces the index loop + manual list growth; the result is
# still a list of ggplot objects named plot_list, used by the grid below.
plot_list <- lapply(seq_along(instances_names), function(i) {
  instance <- instances_names[i]
  # Subsets for this percentage: per-technique data and cluster means
  tech_data <- filter(meanKLC_with_clusters, percentage == instance)
  mean_data <- filter(cluster_means, percentage == instance)
  ggplot() +
    # Per-technique curves, faded, colored by their cluster
    geom_line(data = tech_data,
              aes(x = noise, y = kappa_loss, group = technique,
                  color = factor(cluster)),
              linetype = "solid", alpha = 0.5) +
    geom_point(data = tech_data,
               aes(x = noise, y = kappa_loss, group = technique,
                   color = factor(cluster)),
               alpha = 0.5) +
    # Thicker overlaid curves showing the cluster averages
    geom_line(data = mean_data,
              aes(x = noise, y = kappa_loss, group = cluster,
                  color = factor(cluster)),
              linewidth = 1.5) +
    geom_point(data = mean_data,
               aes(x = noise, y = kappa_loss, group = cluster,
                   color = factor(cluster)),
               size = 3) +
    # Shared cluster -> color mapping, consistent with the other plots
    scale_color_manual(name = "Cluster", values = cluster_colors) +
    scale_y_continuous(limits = c(0.0, 0.5), breaks = seq(0, 1, by = 0.1)) +
    labs(x = "Noise",
         y = "Kappa Loss",
         title = paste0(instance, "% of instances altered")) +
    theme_bw() +
    theme(legend.position = "right")
})
# Arrange all plots in a single-column grid using patchwork, if installed
if (requireNamespace("patchwork", quietly = TRUE)) {
  # Using patchwork
  library(patchwork)
  combined_grid <- wrap_plots(plot_list, ncol = 1) +
    plot_annotation(title = "Kappa Loss Curves by Technique and Cluster")
  print(combined_grid)
  # Save the grid plot
  png(filename = "../../results/plots/cluster_curves_grid2.png",
      width = 4000, height = 12000, res = 600)
  print(combined_grid)
  # invisible() keeps dev.off() from echoing the device info
  # (e.g. "quartz_off_screen 2") into the rendered document,
  # consistent with the earlier png/dev.off() calls in this file
  invisible(dev.off())
} else {
  # Fall back to printing plots individually when patchwork is unavailable
  for (p in plot_list) {
    print(p)
  }
}
## quartz_off_screen
## 2